library(tidyverse)
library(ipumsr)
library(srvyr)
library(haven)

Exercise 01 (1 point)

library(ipumsr)
library(tidyverse)
ddi <- read_ipums_ddi("cps_00002.xml")
data <- read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
glimpse(data)
## Rows: 212,608
## Columns: 26
## $ YEAR      <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, …
## $ SERIAL    <dbl> 1, 2, 2, 2, 3, 5, 5, 5, 6, 7, 7, 8, 9, 9, 11, 11, 12, 12, 14…
## $ MONTH     <int+lbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ HWTFINL   <dbl> 1796.811, 2923.587, 2923.587, 2923.587, 1792.269, 1912.517, …
## $ CPSID     <dbl> 2.02003e+13, 2.02002e+13, 2.02002e+13, 2.02002e+13, 2.02001e…
## $ PERNUM    <dbl> 1, 1, 2, 3, 1, 1, 2, 3, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, …
## $ WTFINL    <dbl> 1796.811, 2923.587, 3618.171, 4634.866, 1792.269, 1912.517, …
## $ CPSIDP    <dbl> 2.02003e+13, 2.02002e+13, 2.02002e+13, 2.02002e+13, 2.02001e…
## $ RELATE    <int+lbl> 101, 101, 301, 501, 101, 101, 1260, 1260, 101, 101, 202,…
## $ AGE       <int+lbl> 72, 21, 1, 27, 59, 67, 33, 33, 48, 80, 80, 80, 44, 19, 7…
## $ SEX       <int+lbl> 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2,…
## $ RACE      <int+lbl> 100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 1…
## $ MARST     <int+lbl> 4, 4, 9, 6, 5, 3, 4, 6, 6, 1, 1, 6, 6, 6, 1, 1, 1, 1, 4,…
## $ POPSTAT   <int+lbl> 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ EMPSTAT   <int+lbl> 10, 21, 0, 21, 10, 10, 34, 34, 10, 36, 10, 36, 10, 34, 1…
## $ LABFORCE  <int+lbl> 2, 2, 0, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1,…
## $ OCC       <dbl> 2205, 4720, 0, 710, 3255, 2040, 0, 0, 3930, 0, 4920, 0, 9645…
## $ IND       <dbl> 7870, 8680, 0, 4670, 8191, 9160, 0, 0, 4971, 0, 7071, 0, 629…
## $ UHRSWORKT <int+lbl> 40, 999, 999, 999, 40, 65, 999, 999, 997, 999, 997, 999,…
## $ UHRSWORK1 <int+lbl> 40, 999, 999, 999, 40, 65, 999, 999, 997, 999, 997, 999,…
## $ UHRSWORK2 <int+lbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 9…
## $ AHRSWORKT <dbl+lbl>  40, 999, 999, 999,  40,  40, 999, 999,  45, 999,  24, 9…
## $ AHRSWORK1 <int+lbl> 40, 999, 999, 999, 40, 40, 999, 999, 45, 999, 24, 999, 4…
## $ AHRSWORK2 <int+lbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 9…
## $ ABSENT    <int+lbl> 0, 2, 0, 2, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 3, 1, 0, 0, 1,…
## $ WKSTAT    <int+lbl> 11, 60, 99, 50, 11, 11, 99, 99, 11, 99, 41, 99, 11, 99, …

Exercise 02 (2 points)

#2.

library(srvyr)
cps_svy <- as_survey_design(data, weights = WTFINL)
class(data)
## [1] "tbl_df"     "tbl"        "data.frame"
class(cps_svy)
## [1] "tbl_svy"        "survey.design2" "survey.design"

#3. The difference is that, the as_survey_design function turn the tibble into a “tbl_svy”, object, and create two new catagories called “survey.design” & “survey.design2”

#4. Yes, there is a value of NIU: #for UHRSWORKT, 1999-onward ASEC:0 = No hours; 997 = Hours vary; 999 = Not in universe (NIU) #for AHRSWORKT, 999 = NIU (Not in universe)

NIU.UHR.AHR <- data %>%
  group_by(YEAR, UHRSWORKT, AHRSWORKT) %>%
  summarise(
    unweighted = n()
  ) %>%
  filter(UHRSWORKT == 999 | AHRSWORKT == 999) %>%
  arrange(YEAR, desc(UHRSWORKT))
NIU.UHR.AHR

#observed that for all observations in 2020 and 2021, while UHRSWORKT == 999, then AHRSWORKT is inevitabily 999, that means to sum up the observations that are not in the universe in these two years, we could only group_by the AHRSWORKT variable. Rewrite our code:

NIU.UHR.AHR <- data %>%
  group_by(YEAR, AHRSWORKT) %>%
  summarise(
    unweighted = n()
  ) %>%
  filter(AHRSWORKT == 999) %>%
  arrange(YEAR, desc(AHRSWORKT))
NIU.UHR.AHR

#Thus, in 2020, there were 63321 observations that are not in the universe for UHRSWORKT and AHRSWORKT; in 2021, there were 61703 of them.

#5.Filter out observations not in the universe for UHRSWORKT and create a new data frame called cps_subset_svy with the assignment operator.

cps_subset_svy <- data %>%
  filter(UHRSWORKT != 999)
cps_subset_svy

#6.Count the number of unweighted responses for each value for UHRSWORKT.

cps_subset_6 <- cps_subset_svy %>%
  group_by(UHRSWORKT) %>%
  filter(UHRSWORKT != 997) %>%
  summarize (
    unweighted = n()
  ) 
ggplot(data = cps_subset_6) + 
  geom_col(mapping = aes(x = UHRSWORKT, y = unweighted))

#7.Calculate the mean usual hours worked (UHRSWORKT) in 2020 and 2021. Exclude workers with “Hours vary”.

cps_subset_7 <- cps_subset_svy %>%
  group_by(YEAR) %>%
  filter(UHRSWORKT != 997) %>%
  summarise(mean_UHRSWORKT = mean(UHRSWORKT))
cps_subset_7

#8.Calculate the proportion of workers who usually worked exactly 40 hours in 2021.

cps_subset_8 <- cps_subset_svy %>%
  filter(YEAR == 2021) %>%
  mutate(
    HOURS_40 = if_else(condition = UHRSWORKT == 40, true = 1, false = 0)
    ) %>%
  group_by(YEAR) %>%
  summarise(proportion_40 = mean(HOURS_40))
cps_subset_8

#9.Calculate the proportion of workers who worked less, the same, and more than usual in April 2020 and April 2021 (separately) by comparing UHRSWORKT and AHRSWORKT.

cps_subset_91 <- cps_subset_svy %>%
  filter(UHRSWORKT != 997) %>%
  select(YEAR, UHRSWORKT, AHRSWORKT) %>%
  mutate(
    AprilWork = case_when(
      AHRSWORKT > UHRSWORKT ~ "work more",
      AHRSWORKT < UHRSWORKT ~ "work less",
      TRUE ~ "work the same"
    )
  ) 
cps_subset_92 <- as_survey_design(cps_subset_91) %>%
  group_by(YEAR, AprilWork) %>%
  summarise(prop = survey_prop())
cps_subset_92

Exercise 03 (2 points)

#2. Using cps_svy, calculate the weighted count of observations ages 16 or older for each year with survey_count(). Your results should match the April 2020 and April 2021 values to the closest thousand because of rounding. #Sorry I don’t have any idea about “survey_count”, it seems not appear when I was trying to search it. I also don’t know how to put the age >= 16 into weight.

#3. Create three numeric indicator variables.

cps_svy_33 <- cps_svy %>%
  mutate(
    labor_force = case_when(
      LABFORCE == 2 ~ 1,
      LABFORCE == 1 ~ 0,
      TRUE ~ 999
    )
  ) %>%
  mutate(
    employed = case_when(
      EMPSTAT == 10 | EMPSTAT == 12 ~ 1,
      EMPSTAT == 20 | EMPSTAT == 21 | EMPSTAT == 22 ~ 0,
      TRUE ~ 999
    )
  ) %>%
  mutate(
    unemployed = case_when(
      EMPSTAT == 10 | EMPSTAT == 12 ~ 0,
      EMPSTAT == 20 | EMPSTAT == 21 | EMPSTAT == 22 ~ 1,
      TRUE ~ 999
    )
  )

#4.Filter to the civilian population ages 16 or older, and calculate the relavant population.

cps_svy_34 <- cps_svy_33 %>%
  filter(AGE >= 16) %>%
  filter(POPSTAT == 1) %>%
  summarise(
    LABFOR_cal = survey_total(labor_force == 1),
    EMPLOY_cal = survey_total(employed == 1),
    UNEMPLOY_cal = survey_total(unemployed == 1)
            )
cps_svy_34

#5. Compare the result with microdata to the official tabulation for April 2021. #The official data is: civilian labor force - 160,379,000; employed - 151,160,000; unemployed - 9,220,000. #What I got here are (shown above): civilian labor force - 316,601,435; employed - 284,601,546; unemployed - 31,999,889. They are far from “close”, thus there might be some problems in my code.

Exercise 04 (2 points)

#1.-3

cps2021 <- filter(data, YEAR == 2021)
cps2021_sum <- cps2021 %>%
  summarise(n())
cps2021_sum
nber <- read_dta("cpsb202104.dta")
nber_sum <- nber %>%
  select(hufinal, pwcmpwgt) %>%
  summarise(n())
nber_sum

#3. Notice that n = 133,449. There are 133,449 observations in dataset “nber”, while there are 111,003 observations in dataset “cps2021”.

#4. As it shows, there are 111,003 observations in nber_44, which match with the amount of the observations in dataset “cps2021”.

nber_44 <- filter(nber, hufinal <= 205)
summarise(nber_44, n())

#5. Combine the columns.

nber_cps <- bind_cols(cps2021, nber_44)
nber_cps

#6.

nber_cps_svy <- nber_cps %>%
  mutate(pwcm_cal = pwcmpwgt/10000) %>%
  as_survey_design(weights = pwcm_cal) %>%
  mutate(
    labor_force = case_when(
      LABFORCE == 2 ~ 1,
      LABFORCE == 1 ~ 0,
      TRUE ~ 999
    )
  ) %>%
  mutate(
    employed = case_when(
      EMPSTAT == 10 | EMPSTAT == 12 ~ 1,
      EMPSTAT == 20 | EMPSTAT == 21 | EMPSTAT == 22 ~ 0,
      TRUE ~ 999
    )
  ) %>%
  mutate(
    unemployed = case_when(
      EMPSTAT == 10 | EMPSTAT == 12 ~ 0,
      EMPSTAT == 20 | EMPSTAT == 21 | EMPSTAT == 22 ~ 1,
      TRUE ~ 999
    )
  ) %>%
  filter(AGE >= 16) %>%
  filter(POPSTAT == 1) %>%
  summarise(
    LABFOR_cal = survey_total(labor_force == 1),
    EMPLOY_cal = survey_total(employed == 1),
    UNEMPLOY_cal = survey_total(unemployed == 1)
            )
nber_cps_svy

Exercise 05 (2 points)